Análisis Exploratorio Inicial

setwd("~/Desktop/Datos")
datos<-read.table("SAheart.csv",sep = ";",dec='.',header=TRUE)
str(datos)
'data.frame':   462 obs. of  10 variables:
 $ sbp      : int  160 144 118 170 134 132 142 114 114 132 ...
 $ tobacco  : num  12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
 $ ldl      : num  5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
 $ adiposity: num  23.1 28.6 32.3 38 27.8 ...
 $ famhist  : chr  "Present" "Absent" "Present" "Present" ...
 $ typea    : int  49 55 52 51 60 62 59 62 49 69 ...
 $ obesity  : num  25.3 28.9 29.1 32 26 ...
 $ alcohol  : num  97.2 2.06 3.81 24.26 57.34 ...
 $ age      : int  52 63 46 58 49 45 38 58 29 53 ...
 $ chd      : chr  "Si" "Si" "No" "Si" ...
summary(datos)
      sbp           tobacco             ldl           adiposity    
 Min.   :101.0   Min.   : 0.0000   Min.   : 0.980   Min.   : 6.74  
 1st Qu.:124.0   1st Qu.: 0.0525   1st Qu.: 3.283   1st Qu.:19.77  
 Median :134.0   Median : 2.0000   Median : 4.340   Median :26.11  
 Mean   :138.3   Mean   : 3.6356   Mean   : 4.740   Mean   :25.41  
 3rd Qu.:148.0   3rd Qu.: 5.5000   3rd Qu.: 5.790   3rd Qu.:31.23  
 Max.   :218.0   Max.   :31.2000   Max.   :15.330   Max.   :42.49  
   famhist              typea         obesity         alcohol      
 Length:462         Min.   :13.0   Min.   :14.70   Min.   :  0.00  
 Class :character   1st Qu.:47.0   1st Qu.:22.98   1st Qu.:  0.51  
 Mode  :character   Median :53.0   Median :25.80   Median :  7.51  
                    Mean   :53.1   Mean   :26.04   Mean   : 17.04  
                    3rd Qu.:60.0   3rd Qu.:28.50   3rd Qu.: 23.89  
                    Max.   :78.0   Max.   :46.58   Max.   :147.19  
      age            chd           
 Min.   :15.00   Length:462        
 1st Qu.:31.00   Class :character  
 Median :45.00   Mode  :character  
 Mean   :42.82                     
 3rd Qu.:55.00                     
 Max.   :64.00                     
datos
mean(datos$age)
[1] 42.81602
var(datos$age)
[1] 213.4216
sd(datos$age)
[1] 14.60896
cor(datos$age,datos$obesity)
[1] 0.2917771
# cor(datos) da un error ¿Por qué? (famhist y chd son de tipo character, no numéricas)
datos2<-datos[,-c(5,10)]
head(datos2)
cor(datos2)
                  sbp     tobacco         ldl   adiposity       typea
sbp        1.00000000  0.21224652  0.15829633  0.35650008 -0.05745431
tobacco    0.21224652  1.00000000  0.15890546  0.28664037 -0.01460788
ldl        0.15829633  0.15890546  1.00000000  0.44043175  0.04404758
adiposity  0.35650008  0.28664037  0.44043175  1.00000000 -0.04314364
typea     -0.05745431 -0.01460788  0.04404758 -0.04314364  1.00000000
obesity    0.23806661  0.12452941  0.33050586  0.71655625  0.07400610
alcohol    0.14009559  0.20081339 -0.03340340  0.10033013  0.03949794
age        0.38877060  0.45033016  0.31179923  0.62595442 -0.10260632
             obesity     alcohol        age
sbp       0.23806661  0.14009559  0.3887706
tobacco   0.12452941  0.20081339  0.4503302
ldl       0.33050586 -0.03340340  0.3117992
adiposity 0.71655625  0.10033013  0.6259544
typea     0.07400610  0.03949794 -0.1026063
obesity   1.00000000  0.05161957  0.2917771
alcohol   0.05161957  1.00000000  0.1011246
age       0.29177713  0.10112465  1.0000000
library(ellipse)

Attaching package: 'ellipse'
The following object is masked from 'package:graphics':

    pairs
matriz.correlacion<-cor(datos2)
plotcorr(matriz.correlacion,col = "red")

library(corrplot)
corrplot 0.92 loaded
corrplot(matriz.correlacion)

col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(matriz.correlacion, method="shade", shade.col=NA, tl.col="black", tl.srt=45,
col=col(200), addCoef.col="black", order="AOE")
library("ggplot2")                    

library("GGally")                      
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
ggpairs(datos2) 

Código Disyuntivo en R - Variables “Dummy”

setwd("~/Desktop/Datos")
Datos <- read.csv("SAheart.csv",header=TRUE, sep=";", dec=".")
str(Datos)
'data.frame':   462 obs. of  10 variables:
 $ sbp      : int  160 144 118 170 134 132 142 114 114 132 ...
 $ tobacco  : num  12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
 $ ldl      : num  5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
 $ adiposity: num  23.1 28.6 32.3 38 27.8 ...
 $ famhist  : chr  "Present" "Absent" "Present" "Present" ...
 $ typea    : int  49 55 52 51 60 62 59 62 49 69 ...
 $ obesity  : num  25.3 28.9 29.1 32 26 ...
 $ alcohol  : num  97.2 2.06 3.81 24.26 57.34 ...
 $ age      : int  52 63 46 58 49 45 38 58 29 53 ...
 $ chd      : chr  "Si" "Si" "No" "Si" ...
dim(Datos)
[1] 462  10
head(Datos)
famhist.Present <- as.numeric(Datos$famhist == "Present")
print(famhist.Present)
  [1] 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0
 [38] 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0 1
 [75] 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0
[112] 1 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 1 1
[149] 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 1 0 0 0 0 1 1 1 1
[186] 1 0 0 1 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1
[223] 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 0
[260] 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
[297] 0 0 0 1 1 0 0 1 0 1 1 0 0 1 1 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 1 0 0 0 0 1 1
[334] 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 1 0 1 1 1 0 0 1 1 0
[371] 1 1 0 1 1 0 0 1 0 0 0 1 1 0 0 1 1 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 1 1
[408] 1 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0
[445] 0 1 0 1 1 1 0 0 1 1 0 0 1 0 0 0 0 1
famhist.Absent <- as.numeric(Datos$famhist == "Absent")
print(famhist.Absent)
  [1] 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1 0 1 1 0 1 1 1 0 1 1 1 1
 [38] 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 1 1 1 1 0 1 0 0 1 1 0 0 1 0 1 1 1 1 0
 [75] 1 1 1 0 1 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 1 1 0 1 1 1
[112] 0 1 0 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1 0 0
[149] 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0 0
[186] 0 1 1 0 0 1 0 0 1 1 0 1 1 0 1 1 0 1 1 0 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1 1 0
[223] 1 0 1 1 0 1 0 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 0 1 0 1 1 1 0 0 0 0 0 1
[260] 0 1 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1
[297] 1 1 1 0 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 1 0 1 0 0 1 0 1 1 0 1 1 1 1 0 0
[334] 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 1
[371] 0 0 1 0 0 1 1 0 1 1 1 0 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 1 0 1 1 1 0 0 1 0 0
[408] 0 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1
[445] 1 0 1 0 0 0 1 1 0 0 1 1 0 1 1 1 1 0
chd.Si <- as.numeric(Datos$chd == "Si")
print(chd.Si)
  [1] 1 1 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 1 1 1 1 0 1 0
 [38] 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 [75] 0 0 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0
[112] 1 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 1
[149] 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1
[186] 1 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1
[223] 0 1 0 0 1 1 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1
[260] 0 1 0 0 0 1 0 0 0 0 1 1 1 0 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 1
[297] 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1
[334] 1 1 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
[371] 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1
[408] 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[445] 0 0 1 0 0 1 0 0 0 1 1 1 0 0 1 0 0 1
chd.No <- as.numeric(Datos$chd == "No")
print(chd.No)
  [1] 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 1 0 1
 [38] 1 1 0 0 1 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 [75] 1 1 1 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1
[112] 0 1 0 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 1 1 1 1 0 0 1 1 1 1 1 0
[149] 0 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0
[186] 0 1 1 1 0 0 0 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 0
[223] 1 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0 0 1 0
[260] 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 1 0 1 1 1 0 1 0
[297] 1 1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0
[334] 0 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1
[371] 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 0 0 0 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0
[408] 0 1 1 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[445] 1 1 0 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0
Datos2<-Datos[,-c(5,10)]
Datos2<-cbind(Datos2,famhist.Present)
Datos2<-cbind(Datos2,famhist.Absent)
Datos2<-cbind(Datos2,chd.Si)
Datos2<-cbind(Datos2,chd.No)
str(Datos2)
'data.frame':   462 obs. of  12 variables:
 $ sbp            : int  160 144 118 170 134 132 142 114 114 132 ...
 $ tobacco        : num  12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
 $ ldl            : num  5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
 $ adiposity      : num  23.1 28.6 32.3 38 27.8 ...
 $ typea          : int  49 55 52 51 60 62 59 62 49 69 ...
 $ obesity        : num  25.3 28.9 29.1 32 26 ...
 $ alcohol        : num  97.2 2.06 3.81 24.26 57.34 ...
 $ age            : int  52 63 46 58 49 45 38 58 29 53 ...
 $ famhist.Present: num  1 0 1 1 1 1 0 1 1 1 ...
 $ famhist.Absent : num  0 1 0 0 0 0 1 0 0 0 ...
 $ chd.Si         : num  1 1 0 1 1 0 0 1 0 1 ...
 $ chd.No         : num  0 0 1 0 0 1 1 0 1 0 ...
dim(Datos2)
[1] 462  12
head(Datos2)

Código Disyuntivo en R - Variables “Dummy” con el paquete “dummies”

setwd("~/Desktop/Datos")
Datos <- read.csv("SAheart.csv",header=TRUE, sep=";", dec=".")
library(dummies)
dummies-1.5.6 provided by Decision Patterns
Datos2 <- dummy.data.frame(Datos, sep = ".")
str(Datos2)
'data.frame':   462 obs. of  12 variables:
 $ sbp            : int  160 144 118 170 134 132 142 114 114 132 ...
 $ tobacco        : num  12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
 $ ldl            : num  5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
 $ adiposity      : num  23.1 28.6 32.3 38 27.8 ...
 $ famhist.Absent : int  0 1 0 0 0 0 1 0 0 0 ...
 $ famhist.Present: int  1 0 1 1 1 1 0 1 1 1 ...
 $ typea          : int  49 55 52 51 60 62 59 62 49 69 ...
 $ obesity        : num  25.3 28.9 29.1 32 26 ...
 $ alcohol        : num  97.2 2.06 3.81 24.26 57.34 ...
 $ age            : int  52 63 46 58 49 45 38 58 29 53 ...
 $ chd.No         : int  0 0 1 0 0 1 1 0 1 0 ...
 $ chd.Si         : int  1 1 0 1 1 0 0 1 0 1 ...
 - attr(*, "dummies")=List of 2
  ..$ famhist: int [1:2] 5 6
  ..$ chd    : int [1:2] 11 12

Buscando Atípicos

setwd("~/Desktop/Datos")
datos=read.csv("EjemploClientes.csv",sep = ";",dec=',',header=TRUE,row.names=1)
suppressMessages(suppressWarnings(library(FactoMineR)))
suppressMessages(suppressWarnings(library(car)))
Atipicos<-(Boxplot(~Edad, data=datos, id.method="y",col="Blue"))

Atipicos
[1] " Damaris "
Atipicos<-Boxplot(~Antiguedad, data=datos, id.method="y",col="Red")

Atipicos
[1] " Arnoldo "
Atipicos<-Boxplot(~Espacios.Parqueo, data=datos, id.method="y",col="Red")

Atipicos
[1] " Griselda " " Diana "   

Regresión Lineal Clásica

Análisis Preliminar

Ejemplo: BostonCasas.csv

setwd("~/Desktop/Datos")
Datos <- read.csv("BostonCasas.csv",row.names=1,header=TRUE, sep=",", dec=".")
dim(Datos)
[1] 506  14
str(Datos)
'data.frame':   506 obs. of  14 variables:
 $ Crimin   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
 $ ResidM25 : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
 $ NegocMin : chr  "Baja" "Baja" "Baja" "Baja" ...
 $ LimitaRC : chr  "No" "No" "No" "No" ...
 $ OxNitr   : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
 $ Habitac  : num  6.58 6.42 7.18 7 7.15 ...
 $ VivA1940 : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
 $ Distanc  : num  4.09 4.97 4.97 6.06 6.06 ...
 $ AccAutop : int  1 2 2 3 3 3 5 5 5 5 ...
 $ Impuesto : int  296 242 242 222 222 222 311 311 311 311 ...
 $ AlumProf : num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
 $ Afroame  : num  397 397 393 395 397 ...
 $ ClaseBaja: num  4.98 9.14 4.03 2.94 5.33 ...
 $ ValorProm: num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
Datos
# Elimino variables categóricas
Datos2 <- Datos[,-c(3,4)]
dim(Datos2)
[1] 506  12
str(Datos)
'data.frame':   506 obs. of  14 variables:
 $ Crimin   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
 $ ResidM25 : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
 $ NegocMin : chr  "Baja" "Baja" "Baja" "Baja" ...
 $ LimitaRC : chr  "No" "No" "No" "No" ...
 $ OxNitr   : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
 $ Habitac  : num  6.58 6.42 7.18 7 7.15 ...
 $ VivA1940 : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
 $ Distanc  : num  4.09 4.97 4.97 6.06 6.06 ...
 $ AccAutop : int  1 2 2 3 3 3 5 5 5 5 ...
 $ Impuesto : int  296 242 242 222 222 222 311 311 311 311 ...
 $ AlumProf : num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
 $ Afroame  : num  397 397 393 395 397 ...
 $ ClaseBaja: num  4.98 9.14 4.03 2.94 5.33 ...
 $ ValorProm: num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
Datos2
library(corrplot)
matriz.correlacion<-cor(Datos2)
corrplot(matriz.correlacion)

# Funciones para el gráfico de pares

# Lower-triangle panel for GGally::ggpairs: scatter plot with a fitted
# least-squares line (no confidence band).
# `data` and `mapping` are supplied by ggpairs for each variable pair.
lower <- function(data, mapping){
  ggplot(data = data, mapping = mapping) +
    geom_point(size = 1, col = "dodgerblue3") +
    geom_smooth(method = lm, size = 0.4, color = "red", se = FALSE)
}

# Diagonal panel for GGally::ggpairs: density-scaled histogram with an
# overlaid kernel density curve.
# NOTE(review): this name masks base::diag(); rename if base::diag() is
# needed later in the session.
diag <- function(data, mapping){
  base.plot <- ggplot(data = data, mapping = mapping)
  base.plot +
    geom_histogram(bins = 30, col = "black", fill = "#F8766D",
                   aes(y = ..density..)) +
    geom_density()
}

library(tidyverse) # Para usar la función select_if
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ lubridate 1.9.2     ✔ tibble    3.1.8
✔ purrr     1.0.1     ✔ tidyr     1.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()     masks stats::filter()
✖ dplyr::group_rows() masks kableExtra::group_rows()
✖ dplyr::lag()        masks stats::lag()
✖ dplyr::recode()     masks car::recode()
✖ purrr::some()       masks car::some()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
ggpairs(select_if(Datos2, is.numeric) %>% dplyr::select(-ValorProm, everything()), # el select posiciona la variable a predecir al final, para que se grafique bien en caso de que solo haya 1 variable a predecir
        lower = list(continuous = lower), diag = list(continuous = diag))
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

Regresión Lineal Simple

Ejemplo 1: COVID-19

corona <- c(1,5,9,9,13,22,23,26,27,35,41,50,69,87,113,117,134,158,177,201,231)
corona
 [1]   1   5   9   9  13  22  23  26  27  35  41  50  69  87 113 117 134 158 177
[20] 201 231
barplot(corona,col = "green")

casos.nuevos.dia<-diff(corona)
casos.nuevos.dia
 [1]  4  4  0  4  9  1  3  1  8  6  9 19 18 26  4 17 24 19 24 30
length(casos.nuevos.dia)
[1] 20
barplot(casos.nuevos.dia,col = "blue")

eje.x<-seq(1:(length(corona)-1))
eje.x
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
datos<-rbind(eje.x,casos.nuevos.dia)
datos
                 [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
eje.x               1    2    3    4    5    6    7    8    9    10    11    12
casos.nuevos.dia    4    4    0    4    9    1    3    1    8     6     9    19
                 [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
eje.x               13    14    15    16    17    18    19    20
casos.nuevos.dia    18    26     4    17    24    19    24    30
datos<-t(datos)
datos<-as.data.frame(datos)
datos
modelo.lm <- lm(casos.nuevos.dia~eje.x,data=datos)
modelo.lm

Call:
lm(formula = casos.nuevos.dia ~ eje.x, data = datos)

Coefficients:
(Intercept)        eje.x  
     -2.521        1.335  
summary(modelo.lm)

Call:
lm(formula = casos.nuevos.dia ~ eje.x, data = datos)

Residuals:
     Min       1Q   Median       3Q      Max 
-13.5090  -3.3323  -0.1677   4.0989   9.8263 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -2.5211     2.6123  -0.965    0.347    
eje.x         1.3353     0.2181   6.123 8.76e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.623 on 18 degrees of freedom
Multiple R-squared:  0.6757,    Adjusted R-squared:  0.6576 
F-statistic:  37.5 on 1 and 18 DF,  p-value: 8.76e-06
plot(datos$eje.x,datos$casos.nuevos.dia,col="red")
abline(modelo.lm,col="blue")

pred<-predict(modelo.lm,data.frame(eje.x=c(21,22,23)),interval="confidence")
pred
       fit      lwr      upr
1 25.52105 20.03287 31.00924
2 26.85639 20.96250 32.75028
3 28.19173 21.88495 34.49850
# El 4 (en la posición 15) es un dato atípico; se reemplaza por la media
casos.nuevos.dia
 [1]  4  4  0  4  9  1  3  1  8  6  9 19 18 26  4 17 24 19 24 30
casos.nuevos.dia[15]<-mean(casos.nuevos.dia)
casos.nuevos.dia
 [1]  4.0  4.0  0.0  4.0  9.0  1.0  3.0  1.0  8.0  6.0  9.0 19.0 18.0 26.0 11.5
[16] 17.0 24.0 19.0 24.0 30.0
datos<-rbind(eje.x,casos.nuevos.dia)
datos<-t(datos)
datos<-as.data.frame(datos)

modelo.lm <- lm(casos.nuevos.dia~eje.x,data=datos)
modelo.lm

Call:
lm(formula = casos.nuevos.dia ~ eje.x, data = datos)

Coefficients:
(Intercept)        eje.x  
     -2.679        1.386  
summary(modelo.lm)

Call:
lm(formula = casos.nuevos.dia ~ eje.x, data = datos)

Residuals:
   Min     1Q Median     3Q    Max 
-7.410 -3.682 -0.568  4.117  9.274 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -2.6789     2.2394  -1.196    0.247    
eje.x         1.3861     0.1869   7.415 7.11e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.821 on 18 degrees of freedom
Multiple R-squared:  0.7533,    Adjusted R-squared:  0.7396 
F-statistic: 54.98 on 1 and 18 DF,  p-value: 7.107e-07
plot(datos$eje.x,datos$casos.nuevos.dia,col="red")
abline(modelo.lm,col="blue")

pred<-predict(modelo.lm,data.frame(eje.x=c(21,22,23)),interval="confidence")
pred
       fit      lwr      upr
1 26.42895 21.72419 31.13371
2 27.81504 22.76249 32.86758
3 29.20113 23.79463 34.60763

Ejemplo 2: Boston Casas

modelo.lm <- lm(ValorProm~ClaseBaja, data = Datos2)
modelo.lm

Call:
lm(formula = ValorProm ~ ClaseBaja, data = Datos2)

Coefficients:
(Intercept)    ClaseBaja  
      34.55        -0.95  
summary(modelo.lm)

Call:
lm(formula = ValorProm ~ ClaseBaja, data = Datos2)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.168  -3.990  -1.318   2.034  24.500 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.55384    0.56263   61.41   <2e-16 ***
ClaseBaja   -0.95005    0.03873  -24.53   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.216 on 504 degrees of freedom
Multiple R-squared:  0.5441,    Adjusted R-squared:  0.5432 
F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
plot(Datos2$ClaseBaja,Datos2$ValorProm,col="red")
abline(modelo.lm,col="blue")

pred<-predict(modelo.lm,data.frame(ClaseBaja=c(8,10,15)),interval="confidence")
pred
       fit      lwr      upr
1 26.95345 26.30529 27.60161
2 25.05335 24.47413 25.63256
3 20.30310 19.73159 20.87461

Regresión sobre la Tabla Completa

Enfoque estadístico - Bondad de Ajuste

Se puede usar para predecir pero no para medir el error

# Acepta variables categóricas sin problema
modelo.lm <- lm(ValorProm~., data = Datos2)
modelo.lm

Call:
lm(formula = ValorProm ~ ., data = Datos2)

Coefficients:
(Intercept)       Crimin     ResidM25       OxNitr      Habitac     VivA1940  
  36.711949    -0.114059     0.046099   -16.690209     3.825925     0.002839  
    Distanc     AccAutop     Impuesto     AlumProf      Afroame    ClaseBaja  
  -1.513485     0.316561    -0.012696    -0.980534     0.009692    -0.531803  
summary(modelo.lm)

Call:
lm(formula = ValorProm ~ ., data = Datos2)

Residuals:
     Min       1Q   Median       3Q      Max 
-13.3346  -2.7771  -0.5547   1.8853  26.3736 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  36.711949   5.136151   7.148 3.19e-12 ***
Crimin       -0.114059   0.033064  -3.450 0.000609 ***
ResidM25      0.046099   0.013762   3.350 0.000871 ***
OxNitr      -16.690209   3.707108  -4.502 8.40e-06 ***
Habitac       3.825925   0.419477   9.121  < 2e-16 ***
VivA1940      0.002839   0.013301   0.213 0.831059    
Distanc      -1.513485   0.196417  -7.705 7.20e-14 ***
AccAutop      0.316561   0.064028   4.944 1.05e-06 ***
Impuesto     -0.012696   0.003396  -3.738 0.000207 ***
AlumProf     -0.980534   0.130351  -7.522 2.56e-13 ***
Afroame       0.009692   0.002703   3.585 0.000371 ***
ClaseBaja    -0.531803   0.050915 -10.445  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.784 on 494 degrees of freedom
Multiple R-squared:  0.7353,    Adjusted R-squared:  0.7294 
F-statistic: 124.7 on 11 and 494 DF,  p-value: < 2.2e-16

Regresión sobre la Tabla Training

Enfoque Training-Testing

Se usa para medir de mejor manera el error

Luego vamos a usar Validación Cruzada para medir aún mejor el error

Funciones para medir el error y funciones Auxiliares

  • Residual Sum of Square :

\(\huge RSS = \sum_{i=1}^{n}{(y_i-\hat{y}_i)^2}\)

# Residual Sum of Square (RSS)
# Residual Sum of Squares: sum((y_i - yhat_i)^2).
# Pred: vector of predictions; Real: observed values (same length).
RSS <- function(Pred, Real) {
  residuos <- Real - Pred
  sum(residuos^2)
}
  • Residual Standard Error (RSE) :
\(\huge RSE = \sqrt{\frac{\sum_{i = 1}^n{(y_i - \hat{y}_i)^2}}{(n-p-1)}}\)
# NumPred es el número total de predictores; se resta NumPred+1 de n
# (los predictores más 1 por el intercepto)
# Residual Standard Error: sqrt(RSS / (n - p - 1)), where p = NumPred is
# the number of predictors and the extra 1 accounts for the intercept.
RSE <- function(Pred, Real, NumPred) {
  grados.libertad <- length(Real) - (NumPred + 1)
  sqrt(RSS(Pred, Real) / grados.libertad)
}
  • Mean Squared Error

\(\huge MSE = \frac{1}{n}\sum_{i=1}^n{(y_i-\hat{y}_i)^2}\)

# Mean Squared Error: RSS divided by the number of observations.
MSE <- function(Pred, Real) {
  RSS(Pred, Real) / length(Real)
}
  • Error Relativo

\(\huge error.relativo = \frac{\sum_{i=1}^n{|y_i-\hat{y}_i|}}{\sum_{i = 1}^n|y_i|}\)

# Relative error: total absolute residual divided by the total absolute
# magnitude of the observed values.
error.relativo <- function(Pred, Real) {
  desviacion.total <- sum(abs(Real - Pred))
  desviacion.total / sum(abs(Real))
}
# Funciones para desplegar precisión
# Bundles the regression accuracy indices into a named list.
# real: observed values; prediccion: model predictions;
# cantidad.variables.predictoras: number of predictors (degrees of
# freedom adjustment used by RSE).
indices.precision <- function(real, prediccion, cantidad.variables.predictoras) {
  list(
    error.cuadratico      = MSE(prediccion, real),
    raiz.error.cuadratico = RSE(prediccion, real, cantidad.variables.predictoras),
    error.relativo        = error.relativo(prediccion, real),
    correlacion           = as.numeric(cor(prediccion, real))
  )
}


# Gráfico de dispersión entre el valor real de la variable a predecir y la predicción del modelo.
# Scatter plot of the observed values of the target variable against the
# model's predictions. `modelo` (optional) is appended to the title.
plot.real.prediccion <- function(real, prediccion, modelo = "") {
  datos.plot <- data.frame(Real = real, Prediccion = as.numeric(prediccion))
  titulo <- paste0("Real vs Predicción",
                   ifelse(modelo == "", "", paste(", con", modelo)))
  ggplot(data = datos.plot, mapping = aes(x = Real, y = Prediccion)) +
    geom_point(size = 1, col = "dodgerblue3") +
    labs(title = titulo,
         x = "Real",
         y = "Predicción")
}

Ejemplo: BostonCasas.csv

setwd("~/Desktop/Datos")
Datos <- read.csv("BostonCasas.csv",row.names=1,header=TRUE, sep=",", dec=".")
# Elimino variables categóricas
Datos2 <- Datos[,-c(3,4)]
# División Training y Testing, 70% para entrenamiento y 30% para testing
numero.predictoras <- dim(Datos2)[2] - 1
numero.filas <- dim(Datos2)[1]
muestra <- sample(1:numero.filas,numero.filas*0.3)
datos.aprendizaje <- Datos2[-muestra, ]
datos.prueba <- Datos2[muestra, ]
datos.aprendizaje
datos.prueba
# Calcula el modelo usando solo los datos de training
modelo.lm <- lm(ValorProm~., data = datos.aprendizaje)
modelo.lm

Call:
lm(formula = ValorProm ~ ., data = datos.aprendizaje)

Coefficients:
(Intercept)       Crimin     ResidM25       OxNitr      Habitac     VivA1940  
  29.127664    -0.115590     0.031764   -12.277622     4.443833    -0.006966  
    Distanc     AccAutop     Impuesto     AlumProf      Afroame    ClaseBaja  
  -1.288157     0.314014    -0.011023    -1.006278     0.012309    -0.518149  
# Hace la Predicción
prediccion <- predict(modelo.lm, datos.prueba)
# Medición de precisión
pre.lm <- indices.precision(datos.prueba$ValorProm, prediccion,numero.predictoras)
pre.lm
$error.cuadratico
[1] 25.65676

$raiz.error.cuadratico
[1] 5.279369

$error.relativo
[1] 0.163729

$correlacion
[1] 0.8457587
# Gráfico real vs predicción, con curva de mejor ajuste lineal
g <- plot.real.prediccion(datos.prueba$ValorProm, prediccion, modelo = "Regresión Lineal")
g + geom_smooth(method = lm, size = 0.4, color = "red", se = FALSE)
`geom_smooth()` using formula = 'y ~ x'

OBSERVACIÓN Nótese que el “Residual standard error: 4.784” del modelo que se calcula con todos los datos (bondad de ajuste) es menor que “Residual Standard Error (RSE)” que se calcula en los datos testing “raiz.error.cuadratico” = 5.2793686.

Entonces el error se debe medir usando Training-Testing o Validación Cruzada.

Para predecir nuevos individuos se debe usar el modelo con todos los datos.

Ejemplo: BostonCasas.csv con Códigos Disyuntivos Completos - Variables Dummies

library(dummies)
setwd("~/Desktop/Datos")
Datos <- read.csv("BostonCasas.csv",row.names=1,header=TRUE, sep=",", dec=".")
# Convierto las variables categóricas en Dummies
Datos2 <- dummy.data.frame(Datos, sep = ".")
# División Training y Testing, 70% para entrenamiento y 30% para testing
numero.predictoras <- dim(Datos2)[2] - 1
numero.filas <- dim(Datos2)[1]
muestra <- sample(1:numero.filas,numero.filas*0.3)
datos.aprendizaje <- Datos2[-muestra, ]
datos.prueba <- Datos2[muestra, ]
datos.aprendizaje
datos.prueba
# Calcula el modelo usando solo los datos de training
modelo.lm <- lm(ValorProm~., data = datos.aprendizaje)
modelo.lm

Call:
lm(formula = ValorProm ~ ., data = datos.aprendizaje)

Coefficients:
   (Intercept)          Crimin        ResidM25   NegocMin.Alta   NegocMin.Baja  
      32.31317        -0.15832         0.04641         3.08147         1.66204  
NegocMin.Media     LimitaRC.No     LimitaRC.Si          OxNitr         Habitac  
            NA        -2.75387              NA       -18.55975         4.24971  
      VivA1940         Distanc        AccAutop        Impuesto        AlumProf  
      -0.01286        -1.51565         0.43668        -0.01224        -0.82180  
       Afroame       ClaseBaja  
       0.01067        -0.48175  
# Hace la Predicción
prediccion <- predict(modelo.lm, datos.prueba)
# Medición de precisión
pre.lm <- indices.precision(datos.prueba$ValorProm, prediccion,numero.predictoras)
pre.lm
$error.cuadratico
[1] 31.45123

$raiz.error.cuadratico
[1] 5.953261

$error.relativo
[1] 0.1674544

$correlacion
[1] 0.8308064
# Gráfico real vs predicción, con curva de mejor ajuste lineal
g <- plot.real.prediccion(datos.prueba$ValorProm, prediccion, modelo = "Regresión Lineal")
g + geom_smooth(method = lm, size = 0.4, color = "red", se = FALSE)
`geom_smooth()` using formula = 'y ~ x'

Ejemplo: Cáncer de prostata

Estos datos provienen de un estudio que examinó la correlación entre el nivel de antígeno prostático específico y una serie de medidas clínicas en hombres que estaban a punto de recibir una prostatectomía radical. Es un data.frame con 97 filas y 9 columnas.

Este contiene las siguiente variables :

  • lcavol = log(volumen del cáncer)
  • lweight = log(peso de la próstata)
  • age = edad en años
  • lbph = log(cantidad de hiperplasia prostática benigna)
  • svi = invasión de vesículas seminales
  • lcp = log(penetración capsular)
  • gleason = puntaje de gleason
  • pgg45 = porcentaje de puntajes de gleason entre 4 y 5
  • lpsa = log(antígeno prostático específico)

1. Lectura de los datos

setwd("~/Desktop/Datos")
datos.prostata <- read.csv("prostate_data.csv", header=TRUE, dec=",", sep=";")
head(datos.prostata)

2. Correlación entre las variables y gráfico “pairs”

correlaciones <- cor(datos.prostata)
correlaciones
           lcavol    lweight       age         lbph         svi          lcp
lcavol  1.0000000 0.28052138 0.2249999  0.027349703  0.53884500  0.675310484
lweight 0.2805214 1.00000000 0.3479691  0.442264399  0.15538490  0.164537142
age     0.2249999 0.34796911 1.0000000  0.350185896  0.11765804  0.127667752
lbph    0.0273497 0.44226440 0.3501859  1.000000000 -0.08584324 -0.006999431
svi     0.5388450 0.15538490 0.1176580 -0.085843238  1.00000000  0.673111185
lcp     0.6753105 0.16453714 0.1276678 -0.006999431  0.67311118  1.000000000
gleason 0.4324171 0.05688209 0.2688916  0.077820447  0.32041222  0.514830063
pgg45   0.4336522 0.10735379 0.2761124  0.078460018  0.45764762  0.631528246
lpsa    0.7344603 0.43331938 0.1695928  0.179809404  0.56621822  0.548813175
           gleason      pgg45      lpsa
lcavol  0.43241706 0.43365225 0.7344603
lweight 0.05688209 0.10735379 0.4333194
age     0.26889160 0.27611245 0.1695928
lbph    0.07782045 0.07846002 0.1798094
svi     0.32041222 0.45764762 0.5662182
lcp     0.51483006 0.63152825 0.5488132
gleason 1.00000000 0.75190451 0.3689868
pgg45   0.75190451 1.00000000 0.4223159
lpsa    0.36898681 0.42231586 1.0000000
corrplot(correlaciones)

ggpairs(select_if(datos.prostata, is.numeric) %>% dplyr::select(-lpsa, everything()), # el select posiciona la variable a predecir al final, para que se grafique bien en caso de que solo haya 1 variable a predecir
        lower = list(continuous = lower), diag = list(continuous = diag))
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

Podemos ver que la variable que mejor se correlaciona con lpsa es lcavol

3. Modelado

Para realizar el modelo debemos indicarle a la función lm una formula, en R se puede definir una formula de la siguiente manera: y ~ x donde y sería lpsa y x lcavol.

modelo <- lm(lpsa ~ lcavol, data = datos.prostata)
modelo

Call:
lm(formula = lpsa ~ lcavol, data = datos.prostata)

Coefficients:
(Intercept)       lcavol  
     1.5073       0.7193  
coef(modelo)
(Intercept)      lcavol 
  1.5072975   0.7193204 

Podemos ver que el modelo nos retorna dos valores que llamaremos \(\beta_0\) y \(\beta_1\), valores que podemos entender como la ordenada al origen y la pendiente en la ecuación de una recta.

4. Resultados del modelo (a mano)

Veamos cómo un modelo lineal de una sola variable es una operación relativamente sencilla, en la que debemos multiplicar los valores de x por \(\beta_1\) y sumarles \(\beta_0\), siguiendo la siguiente fórmula: \(\large y = \beta_0 + \beta_1x\).

x <- datos.prostata$lcavol
b0 <- coef(modelo)[1]
b0
(Intercept) 
   1.507297 
b1 <- coef(modelo)[2]
b1
   lcavol 
0.7193204 
b0 + b1 * x
 [1] 1.0902222 0.7921115 1.1398502 0.6412553 2.0478064 0.7521390 2.0375546
 [8] 2.0058924 0.9487245 1.6678092 1.6904668 0.5383199 2.6678705 2.5697687
[15] 2.3747769 2.6158846 1.2084087 3.1534522 1.1029539 1.6384451 2.3326474
[22] 2.9885499 1.1154641 2.7889172 1.7844246 2.5480958 1.8761819 1.2192258
[29] 2.2555897 3.2406036 1.7124325 1.6384451 2.4246919 1.5144549 1.5000680
[36] 2.4484079 2.5309683 1.8363325 3.4213792 2.0809606 1.9536908 2.5447028
[43] 1.9260970 2.7816144 2.5763080 2.7041934 3.4694976 2.3439756 2.7630262
[50] 2.3854653 2.2927402 2.7014636 1.8761819 3.0373211 3.7757393 2.4186387
[57] 2.2083181 1.8408708 1.8974024 2.2706809 1.8363325 2.9440807 3.5039214
[64] 2.9709027 2.9985723 2.5565090 2.9623900 3.0886047 1.1862740 2.3661102
[71] 2.8481683 2.3417242 2.3812090 2.8300997 3.6647020 3.7667767 2.9537752
[78] 3.3326860 3.4122738 3.5066055 2.5631694 3.3154215 3.3868864 3.4333433
[85] 2.6311250 3.8831043 2.9633408 2.7529126 3.5268570 2.6311250 3.8425646
[92] 3.3292661 3.5431668 4.2558233 3.5986836 3.5807842 4.0047537

5. Visualización de la recta de mejor ajuste

plot(datos.prostata$lcavol, datos.prostata$lpsa, col = "black",
     main = "Recta de mejor ajuste para las variables lpsa y lcavol",
     xlab = "log(volumen de cáncer)",
     ylab  = "log(antígeno prostático específico)",
     pch = 19)
grid(6, 6, lwd = 2)
abline(modelo, col = "red")

6. Predicción con nuevos datos

Para realizar una predicción utilizando datos nuevos debemos definir un nuevo data.frame con los valores de la variable predictora.

nuevos.datos <- data.frame(lcavol=c(0.70, 0.41, -0.23))
nuevos.datos
prediccion <- predict(modelo, nuevos.datos , interval = "confidence")
prediccion
       fit      lwr      upr
1 2.010822 1.829324 2.192319
2 1.802219 1.598768 2.005670
3 1.341854 1.075486 1.608221

Como resultado tendremos los valores que el modelo estima para lpsa dados los nuevos valores de lcavol.

plot(x = nuevos.datos$lcavol, y = prediccion[,1]
     , col = "black",
     main = "Predicción de valores de lpsa dados nuevos valores de lcavol",
     xlab = "log(volumen de cáncer)",
     ylab  = "log(antígeno prostático específico)",
     pch = 19)
grid(6, 6, lwd = 2)

modelo <- lm(lpsa ~ age, data = datos.prostata)
modelo

Call:
lm(formula = lpsa ~ age, data = datos.prostata)

Coefficients:
(Intercept)          age  
    0.79906      0.02629  
coef(modelo)
(Intercept)         age 
 0.79906018  0.02629454 
plot(x = datos.prostata$age, 
     y = datos.prostata$lpsa, 
     col = "black",
     main = "Recta de mejor ajuste para las variables lpsa y age",
     xlab = "Edad en años",
     ylab  = "log(antígeno prostático específico)",
     pch = 19)
grid(6, 6, lwd = 2)
abline(modelo,col="red")

nuevos.datos <-  data.frame(age=c(42, 55, 71, 77))
prediccion <- predict(modelo,nuevos.datos,interval = "confidence")
prediccion
       fit      lwr      upr
1 1.903431 1.184926 2.621936
2 2.245260 1.885712 2.604808
3 2.665973 2.345921 2.986024
4 2.823740 2.354460 3.293020
plot(x = nuevos.datos$age, y = prediccion[,1]
     , col = "black",
     main = "Predicción de valores de lpsa dados nuevos valores de age",
     xlab = "Edad en años",
     ylab  = "log(antígeno prostático específico)",
     pch = 19)
grid(6, 6, lwd = 2)

Regresión Múltiple

Para estos ejemplos utilizaremos nuevamente los datos de cáncer de próstata ya presentados en el ejemplo anterior.

setwd("~/Desktop/Datos")
datos.prostata <- read.csv("prostate_data.csv", header=TRUE, dec=",", sep=";")
dim(datos.prostata)
[1] 97  9
datos.prostata
datos.prostata.nuevos <- read.csv("newprostate_data.csv", header=TRUE, dec=",", sep=";")
dim(datos.prostata.nuevos)
[1] 10  9
datos.prostata.nuevos

1. Creación de un modelo con todas las variables

modelo <- lm(lpsa ~ ., data = datos.prostata)
modelo 

Call:
lm(formula = lpsa ~ ., data = datos.prostata)

Coefficients:
(Intercept)       lcavol      lweight          age         lbph          svi  
   0.181561     0.564341     0.622020    -0.021248     0.096713     0.761673  
        lcp      gleason        pgg45  
  -0.106051     0.049228     0.004458  
summary(modelo)

Call:
lm(formula = lpsa ~ ., data = datos.prostata)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.76644 -0.35510 -0.00328  0.38087  1.55770 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.181561   1.320568   0.137  0.89096    
lcavol       0.564341   0.087833   6.425 6.55e-09 ***
lweight      0.622020   0.200897   3.096  0.00263 ** 
age         -0.021248   0.011084  -1.917  0.05848 .  
lbph         0.096713   0.057913   1.670  0.09848 .  
svi          0.761673   0.241176   3.158  0.00218 ** 
lcp         -0.106051   0.089868  -1.180  0.24115    
gleason      0.049228   0.155341   0.317  0.75207    
pgg45        0.004458   0.004365   1.021  0.31000    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6995 on 88 degrees of freedom
Multiple R-squared:  0.6634,    Adjusted R-squared:  0.6328 
F-statistic: 21.68 on 8 and 88 DF,  p-value: < 2.2e-16

1.1 Midiendo el error con Training-Testing

setwd("~/Desktop/Datos")
datos.prostata <- read.csv("prostate_data.csv", header=TRUE, dec=",", sep=";")
# División Training y Testing, 70% para entrenamiento y 30% para testing
numero.predictoras <- dim(datos.prostata)[2] - 1
numero.filas <- dim(datos.prostata)[1]
muestra <- sample(1:numero.filas,numero.filas*0.3)
datos.prostata.aprendizaje <- datos.prostata[muestra, ]
datos.prostata.prueba <- datos.prostata[-muestra, ]
datos.prostata.aprendizaje
datos.prostata.prueba
# Calcula el modelo usando solo los datos de training
modelo.lm <- lm(lpsa~., data = datos.prostata.aprendizaje)
modelo.lm

Call:
lm(formula = lpsa ~ ., data = datos.prostata.aprendizaje)

Coefficients:
(Intercept)       lcavol      lweight          age         lbph          svi  
  0.8750968    0.4166023    0.3497793   -0.0508304    0.2269494    1.5309739  
        lcp      gleason        pgg45  
  0.1773820    0.3734952    0.0005517  
# Hace la Predicción
prediccion <- predict(modelo.lm, datos.prostata.prueba)
# Medición de precisión
pre.lm <- indices.precision(datos.prostata.prueba$lpsa, prediccion,numero.predictoras)
pre.lm
$error.cuadratico
[1] 1.137701

$raiz.error.cuadratico
[1] 1.145097

$error.relativo
[1] 0.3297569

$correlacion
[1] 0.6730252
# Gráfico real vs predicción, con curva de mejor ajuste lineal
g <- plot.real.prediccion(datos.prostata.prueba$lpsa, prediccion, modelo = "Regresión Lineal")
g + geom_smooth(method = lm, size = 0.4, color = "red", se = FALSE)
`geom_smooth()` using formula = 'y ~ x'

1.2 Predicción, uso el modelo con todos los datos

prediccion <- predict(modelo,datos.prostata.nuevos, interval="confidence")
prediccion
         fit       lwr      upr
1  2.0738156 1.6676943 2.479937
2  2.4074603 1.9332538 2.881667
3  0.7555028 0.2847731 1.226232
4  2.0666468 1.4775007 2.655793
5  1.4300652 0.6443919 2.215738
6  1.6846533 1.3283367 2.040970
7  3.5046107 3.0312808 3.977941
8  2.5671972 2.0767038 3.057691
9  4.4720592 3.8658599 5.078258
10 4.1548343 3.6786098 4.631059

2. Modelo con algunas variables seleccionadas por nosotros.

modelo <- lm(lpsa ~ lcavol + lweight  + age, data = datos.prostata)
modelo

Call:
lm(formula = lpsa ~ lcavol + lweight + age, data = datos.prostata)

Coefficients:
(Intercept)       lcavol      lweight          age  
   -0.30084      0.66191      0.72849     -0.01187  

2.1 Predicción

prediccion <- predict(modelo, datos.prostata.nuevos, interval = "confidence")
prediccion
         fit       lwr      upr
1  2.1319974 1.8171223 2.446872
2  2.6319604 2.4543466 2.809574
3  0.5687773 0.1292658 1.008289
4  2.1260295 1.8442754 2.407784
5  0.9913164 0.4357991 1.546834
6  1.8741506 1.6541980 2.094103
7  3.7620126 3.4022130 4.121812
8  2.8544516 2.4309001 3.278003
9  4.3463853 3.7516738 4.941097
10 4.1521519 3.8288800 4.475424

¿Cómo calcular el error?

Ahora calculemos el error para varios modelos.

modelo <- lm(lpsa~lcavol+svi,data=datos.prostata)

# Usando newprostate_data.csv para testing
pred<-predict(modelo,datos.prostata.nuevos)

# Se usa NumVar=2 porque tenemos 2 predictores
pre.lm <- indices.precision(datos.prostata.nuevos$lpsa,pred,2)
pre.lm
$error.cuadratico
[1] 0.935128

$raiz.error.cuadratico
[1] 1.15581

$error.relativo
[1] 0.267384

$correlacion
[1] 0.8068148
modelo<-lm(lpsa~.,data=datos.prostata)

# Usando newprostate_data.csv para testing
pred<-predict(modelo,datos.prostata.nuevos)
# dim(PDatos)[2]-1 = Número de Predictores que es lo que recibe la función
pre.lm <- indices.precision(datos.prostata.nuevos$lpsa,pred,dim(datos.prostata.nuevos)[2]-1)
pre.lm
$error.cuadratico
[1] 1.04659

$raiz.error.cuadratico
[1] 3.235104

$error.relativo
[1] 0.2930132

$correlacion
[1] 0.7931361
# Con 1 variable predictora
modelo<-lm(lpsa~lcavol,data=datos.prostata)

# Con la tabla de testing
pred<-predict(modelo,datos.prostata.nuevos)
# Se usa NumVar=1 porque tenemos 1 predictor
pre.lm <- indices.precision(datos.prostata.nuevos$lpsa,pred,1)
pre.lm
$error.cuadratico
[1] 1.229151

$raiz.error.cuadratico
[1] 1.239532

$error.relativo
[1] 0.3058011

$correlacion
[1] 0.711853
# Con 3 variables predictoras
modelo<-lm(lpsa~lcavol+svi+lweight,data=datos.prostata)

# Con la tabla de testing
pred<-predict(modelo,datos.prostata.nuevos)
# Se usa NumVar=3 porque tenemos 3 predictores
pre.lm <- indices.precision(datos.prostata.nuevos$lpsa,pred,3)
pre.lm
$error.cuadratico
[1] 0.9961928

$raiz.error.cuadratico
[1] 1.288535

$error.relativo
[1] 0.2934903

$correlacion
[1] 0.8063431